# Steps in Data Preprocessing. These are the steps:
#Importing the Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Gather Data
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; this script requires an older scikit-learn version to run.
boston_dataset = load_boston()
type(boston_dataset)  # a sklearn Bunch (dict-like) object
# boston_dataset
dir(boston_dataset)  # list the attributes available on the Bunch
print(boston_dataset.DESCR)  # full dataset description text
print(boston_dataset.data)  # feature matrix
print(boston_dataset.feature_names)  # column names for the feature matrix
print(boston_dataset.filename)  # path of the bundled data file
print(boston_dataset.target)  # house prices (the regression target)
# Build a DataFrame from the features, then append the target as 'PRICE'.
data = pd.DataFrame(data = boston_dataset.data, columns = boston_dataset.feature_names)
data.shape
data.head()
data['PRICE'] = boston_dataset.target
data.head()
data.tail()
data.count()  # non-null counts per column
data.info()
data.isnull()  # elementwise null check
data.isnull().any()  # any nulls per column?
data.isnull().sum()  # null counts per column
# Separate the target (prices) from the features for modelling.
prices = data['PRICE']
features = data.drop('PRICE', axis = 1)
prices
features
# Train/test split: 80% train, 20% test; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size=0.2, random_state = 10)
X_train
y_train
X_test
y_test
print("X_train Size :",len(X_train))
print("Y_train Size :",len(y_train))
print("X_test Size :",len(X_test))
print("Y_test Size :",len(y_test))
print("Train Size :", (len(X_train)/len(features))*100)
# BUG FIX: this line reported the TEST share of the data under the label
# "Train Size" (copy-paste error); label corrected.
print("Test Size :", (len(X_test)/len(features))*100)
# Distribution of house prices, with progressively refined styling.
plt.hist(data['PRICE'])
plt.show()
plt.hist(data['PRICE'], bins = 30)  # finer binning
plt.show()
plt.hist(data['PRICE'], bins = 30, ec ='black', color = '#800000', alpha = 0.5)
plt.xlabel('Price in 1000\'s')
plt.ylabel('Nr of Houses')
plt.show()
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; newer code
# should use sns.histplot / sns.displot instead.
sns.distplot(data['PRICE'])
plt.show()
plt.figure(figsize= (10,6))
sns.distplot(data['PRICE'], color = 'red', bins = 50)
plt.show()
plt.figure(figsize= (10,6))
sns.distplot(data['PRICE'], color = 'red', bins = 50, hist = False)  # KDE curve only
plt.show()
plt.figure(figsize= (10,6))
sns.distplot(data['PRICE'], color = 'red', bins = 50, kde = False)  # histogram only
plt.show()
# Distribution of the average number of rooms (RM).
plt.figure(figsize= (10,6))
plt.hist(data['RM'], ec = 'black', color = 'pink')# default bin count (bins argument removed)
plt.xlabel('Average Number of Rooms')
plt.ylabel('Nr of Houses')
plt.show()
data['RM'].mean()  # average number of rooms across all houses
# RAD is a discrete accessibility index, so a plain histogram lumps
# non-adjacent index values into the same bin (hence the bar chart below).
plt.figure(figsize= (10,6))
plt.hist(data['RAD'], ec = 'black', color = 'green', alpha = 0.5)
plt.xlabel('Accessibility to Radial Highways')
plt.ylabel('Nr of Houses')
plt.show()
data['RAD'].value_counts()
accessibility = data['RAD'].value_counts()
type(accessibility)  # a pandas Series: index = RAD value, values = counts
print(accessibility.index)
print(accessibility.values)
# my bar chart: one bar per distinct RAD value
plt.figure(figsize= (10,6))
plt.bar(accessibility.index, accessibility )
plt.show()
plt.figure(figsize= (10,6))
plt.bar(accessibility.index, accessibility, color ='brown', ec = 'blue', width = 0.5)# changing the width, color and ec
plt.xlabel('Accessibility index to Highways')
plt.ylabel('Nr of Houses')
plt.show()
# CHAS is a 0/1 dummy variable (1 = tract bounds the Charles River).
data['CHAS'].value_counts()
plt.figure(figsize= (10,6))
plt.hist(data['CHAS'], ec = 'black', color = 'green', alpha = 0.5)
plt.xlabel('Near Charles River')
plt.ylabel('Nr of Houses')
plt.show()
# Smallest value, Largest Value, Mean Value, Median Value..
data['PRICE'].min()
data['PRICE'].max()
data['PRICE'].mean()
data['PRICE'].median()
data.min()  # column-wise minima
data.max()
data.mean()
data.median()
data.describe()  # summary statistics for every column at once
data['RM'].describe()
# Correlation of price with individual features.
data['PRICE'].corr(data['RM'])
# PTRATIO is the pupil-teacher ratio by town (e.g. 21 pupils per teacher);
# a smaller ratio is generally considered better.
data['PRICE'].corr(data['PTRATIO'])
data.corr()  # full pairwise correlation matrix (returned as a DataFrame)
type(data.corr())
# Build a boolean mask to hide the redundant half of the correlation heatmap.
masker = np.zeros_like(data.corr())
masker
# NOTE(review): the conventional upper-triangle mask uses k=0 (or k=1);
# k=-2 additionally masks two diagonals BELOW the main diagonal — confirm
# this offset is intended.
triangle_indices = np.triu_indices_from(masker, k = -2)
masker[triangle_indices] = True
masker
# Heat map
plt.figure(figsize =(16,10))
sns.heatmap(data.corr())
plt.show()
plt.figure(figsize =(10,8))# reduced the size
sns.heatmap(data.corr())
plt.xticks(fontsize = 10)#adjusted the fonts on the x axis
plt.yticks(fontsize = 10)# adjusted the font on the y axis
plt.show()
plt.figure(figsize =(10,8))
sns.heatmap(data.corr(), mask = masker)  # masked: only one triangle shown
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()
plt.figure(figsize =(10,8))
sns.heatmap(data.corr(), mask = masker, annot=True)  # write values in the cells
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()
plt.figure(figsize =(16,10))
sns.heatmap(data.corr(), mask = masker, annot=True,annot_kws={'size': 14} )
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()
# Advanced Visualisations
nox_dis_corr = round(data['NOX'].corr(data['DIS']), 3)
nox_dis_corr
plt.figure(figsize = (16, 8))
plt.scatter(x=data['DIS'], y =data['NOX'], alpha = 0.5, color = 'green', s= 80)
plt.title(f' Distance from Empy centres versus Pollution, Corr({nox_dis_corr})', fontsize = 25, color ='brown' )
plt.xlabel( 'Distance from Employment Centres...DIS', fontsize =14, color = 'black' )
plt.ylabel( 'Level of Pollutants Nitric Oxide...NOX', fontsize =14, color = 'black' )
plt.show()
price_rad_corr = round(data['PRICE'].corr(data['RAD']), 3)
price_rad_corr
plt.figure(figsize = (16, 8))
# NOTE(review): PRICE is on the x axis here; the next plot swaps the axes.
plt.scatter(x=data['PRICE'], y =data['RAD'], alpha = 0.5, color = 'green', s= 80)
# (axis labels/title intentionally left commented out for this quick look)
plt.show()
price_rad_corr
# Correlation is symmetric, so this equals price_rad_corr above.
rad_price_corr = round(data['RAD'].corr(data['PRICE']), 3)
rad_price_corr
plt.figure(figsize = (16, 8))
plt.scatter(x=data['RAD'], y =data['PRICE'], alpha = 0.5, color = 'green', s= 80)
# (axis labels/title intentionally left commented out for this quick look)
plt.show()
# using seaborn to plot
sns.set() # resets any previous styling
sns.set_style('dark')
sns.jointplot(x = data['DIS'], y = data['NOX'], color = 'red', joint_kws = {'alpha':0.2})
plt.show()
sns.set() # to reset any previous styling
sns.set_style('whitegrid')# to set style.
plt.figure(figsize = (16, 8))
sns.scatterplot(x=data['DIS'], y =data['NOX'], color = 'red', alpha = 0.5) # plain scatter, no marginal distributions
plt.show()
sns.set()
sns.set_style('dark')
sns.jointplot(x=data['DIS'], y =data['NOX'], kind = 'hex' ) # hexbin: darker hexagons in regions of higher density
plt.show()
# TAX and RAD
sns.set() # to reset any previous styling
sns.set_style('dark')# to set style.
sns.jointplot(x=data['TAX'], y =data['RAD'], color = 'darkred', joint_kws = {'alpha' : 0.5} ) # scatter plus marginal distributions
plt.show()
# regression plot: lmplot fits and overlays a regression line on the scatter
sns.lmplot(x ='TAX', y = 'RAD', data = data)
plt.show()
sns.lmplot(x ='TAX', y = 'RAD', data = data, height = 7)
plt.show()
sns.lmplot(x ='RM', y = 'PRICE', data = data, height =10, aspect = 2, ci= None )# ci=None hides the confidence band
plt.show()
# pairplot: one scatter per feature pair (slow on wide DataFrames)
sns.pairplot(data)
plt.show()
sns.pairplot(data, kind = 'reg' , plot_kws = {'line_kws':{'color': 'red'}})# add regression lines to every panel
plt.show()
# First linear regression fit on the raw (untransformed) prices.
regr = LinearRegression()
regr.fit(X_train, y_train) # train on the training split only
regr.coef_  # one coefficient per feature
regr.intercept_
pd.DataFrame(data = regr.coef_, index = X_train.columns, columns = ['Coef'])
regr.score(X_train, y_train) # R-squared on train data, which the model has seen
regr.score(X_test, y_test) # R-squared on the unseen test data
plt.figure(figsize= (16,8))
plt.hist(data['PRICE'], bins =50, ec = 'black', color = 'blue', alpha = 0.7)# bins 50...setting color by hexcode
plt.xlabel('Price in 1000\'s')
plt.ylabel('Nr of Houses')
plt.show()
# The plot above has a skew: it is not balanced about the middle value.
# Target skew is ~0 for a symmetric distribution; here it is about 1.1,
# so log-transform the prices to compress the right tail.
data['PRICE'].skew()
#skew = 0 would be symmetric, but here skew = 1.1
y_log = np.log(data['PRICE'])
y_log.head()
y_log.tail()
data['PRICE'].skew()  # skew before the transformation
y_log.skew()  # skew after: closer to zero
sns.set()
plt.figure(figsize=(20,6))
sns.distplot(y_log)
plt.show()
sns.set()
plt.figure(figsize=(20,6))
sns.distplot(y_log)
plt.title(f'Log price: The skew is {round(y_log.skew(),2)}', fontsize = 25)# f-string embeds the computed skew
plt.xlabel('LOG PRICES ', fontsize =14, color = 'green' )
plt.show()
#plot before log transformation for comparison
skew=round(data['PRICE'].skew(),2)
sns.set()
plt.figure(figsize=(20,6))
sns.distplot(data['PRICE'])
plt.title(f'Actual price: The skew is {skew}', fontsize = 25)
plt.xlabel( 'ACTUAL PRICES ', fontsize =14, color = 'green' )
plt.show()
# Scatter + regression line: LSTAT vs PRICE on the untransformed prices.
sns.lmplot(x='LSTAT', y='PRICE', data=data, height=10, aspect =2,
scatter_kws={'alpha': 0.6}, line_kws={'color':'darkred'})
plt.show()
# Same plot on the log-transformed prices.
data
# BUG FIX: 'transformed_data = features' aliased the features DataFrame, so
# adding LOG_PRICE below silently mutated `features` as well. Take an
# explicit copy so the features table stays free of the target column.
transformed_data = features.copy()
transformed_data['LOG_PRICE'] = y_log
transformed_data
sns.lmplot(x='LSTAT', y='LOG_PRICE', data=transformed_data, height=10, aspect =2,
scatter_kws={'alpha': 0.6}, line_kws={'color':'red'})
plt.show()
# Refit the regression on log prices.
prices = np.log(data['PRICE'])
features = data.drop('PRICE', axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, prices,
test_size=0.2, random_state=10)
regr = LinearRegression()
regr.fit(X_train, y_train)
# BUG FIX: removed a stray pasted-in repr line
# `LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)`.
# It was a no-op, and the `normalize` argument was removed in scikit-learn 1.2,
# so the call raises TypeError on current versions.
print('Training data r-squared:', regr.score(X_train, y_train))
print('Test data r-squared:', regr.score(X_test, y_test))
pd.DataFrame(data=regr.coef_, index=X_train.columns, columns=['coef'])
print('Intercept', regr.intercept_)# intercept after log transformation of prices
# CHAS coefficient ~0.080331 on the log-price scale; exponentiate to express
# its multiplicative effect on the original (dollar) scale.
np.e**0.080331
# Using statsmodels to get the p values.
results = sm.OLS(y_train, sm.add_constant(X_train)).fit()
results.params
results.pvalues
round(results.pvalues,3)
# From the above, INDUS and AGE have p values greater than 0.05.
# Multicollinearity check: VIF (variance inflation factor); a feature with
# VIF > 10 signals a multicollinearity problem.
X_incl_const = sm.add_constant(X_train)
X_incl_const
# IMPROVED: build the design matrix once instead of recomputing
# sm.add_constant(X_train).values on every call / loop iteration.
exog = X_incl_const.values
variance_inflation_factor(exog=exog, exog_idx=10)
variance_inflation_factor(exog=exog, exog_idx=13)
X_incl_const.shape[1]
# VIF for every column (including the constant), via a comprehension.
vif = [variance_inflation_factor(exog=exog, exog_idx=i)
       for i in range(X_incl_const.shape[1])]
print(vif)
pd.DataFrame({'coef_name': X_incl_const.columns, 'vif': np.around(vif, 2)})
# BIC Test: compare candidate models by Bayesian Information Criterion
# (lower BIC is better).
# model_1: all features, log prices
X_incl_const = sm.add_constant(X_train)
model_1 = sm.OLS(y_train, X_incl_const)
results_1 = model_1.fit()
results_1.params
round(results_1.pvalues,3)
round(results_1.bic,2)
# model_2: drop INDUS (high p value), log prices
X_incl_const = sm.add_constant(X_train)
X_incl_const = X_incl_const.drop(['INDUS'], axis = 1)
model_2 = sm.OLS(y_train, X_incl_const)
results_2 = model_2.fit()
results_2.params
round(results_2.pvalues,3)
round(results_2.bic,2)
# model_3: drop both INDUS and AGE, log prices
X_incl_const = sm.add_constant(X_train)
X_incl_const = X_incl_const.drop(['INDUS', 'AGE'],axis = 1)
model_3 = sm.OLS(y_train, X_incl_const)
results_3 = model_3.fit()
results_3.params
round(results_3.pvalues,3)
round(results_3.bic,2)
# Collect coefficients and p-values from the three models into DataFrames,
# concatenate them side by side, and compare BIC / R-squared.
# model_1: all features
X_incl_const = sm.add_constant(X_train)
model_1 = sm.OLS(y_train, X_incl_const)
results_1 = model_1.fit()
results_1.params
results_1.pvalues
model_1_df = pd.DataFrame({'M1_Coef': results_1.params, 'M1_Pvalues' : round(results_1.pvalues,3)})
model_1_df
# model_2: without INDUS
X_incl_const = sm.add_constant(X_train)
model_2 = sm.OLS(y_train, X_incl_const.drop(['INDUS'], axis =1))
results_2 = model_2.fit()
results_2.params
# BUG FIX: this line displayed results_1.pvalues (copy-paste slip);
# model 2's own p-values are the relevant ones here.
results_2.pvalues
model_2_df = pd.DataFrame({'M2_Coef': results_2.params, 'M2_Pvalues' : round(results_2.pvalues,3)})
model_2_df
# model_3: without INDUS and AGE
X_incl_const = sm.add_constant(X_train)
model_3 = sm.OLS(y_train, X_incl_const.drop(['INDUS', "AGE"], axis =1))
results_3 = model_3.fit()
results_3.params
results_3.pvalues
model_3_df = pd.DataFrame({'M3_Coef': results_3.params, 'M3_Pvalues' : round(results_3.pvalues,3)})
model_3_df
frames = [model_1_df, model_2_df,model_3_df]
pd.concat(frames, axis =1)  # NaN appears where a model dropped a feature
print(f'BIC WITH ALL:: {results_1.bic}: W/O INDUS {results_2.bic}: W/O INDUS AND AGE {results_3.bic} ')
print(f'RSQD:: WITH ALL {(results_1.rsquared)}: W/O INDUS {results_2.rsquared}: W/O INDUS,AGE {results_3.rsquared}')
# Final (reduced) model: log prices, dropping INDUS and AGE (high p-values).
prices = np.log(data['PRICE']) # Use log prices
features = data.drop(['PRICE', 'INDUS', 'AGE'], axis=1)# drop the target plus the two weak features
X_train, X_test, y_train, y_test = train_test_split(features, prices,
test_size=0.2, random_state=10)
# Using Statsmodel
results = sm.OLS(y_train, sm.add_constant(X_train)).fit()# regression coeffs including the constant (intercept)
# Residuals = actual minus fitted; identical to results.resid below.
residuals = y_train - results.fittedvalues
results.resid
# Graph of Actual vs. Predicted Prices
corr = round(y_train.corr(results.fittedvalues), 2)# correlation rounded
plt.figure(figsize=(20,6))
plt.scatter(x=y_train, y=results.fittedvalues, c='navy', alpha=0.6)# gives a scatter plot
plt.plot(y_train, y_train, color='red')#the "ideal regression" line (perfect prediction)......not a reality
plt.xlabel(' log prices $y _i$', fontsize=20)
plt.ylabel('Prediced log prices $\hat y _i$', fontsize=20)
plt.title(f'Log prices vs Predicted log prices: $y _i$ vs $\hat y_i$ (Corr {corr})', fontsize=20)
plt.show()
# Actual Prices vs Predicted Prices in 1000's of Dollars (back-transform with e**x)
plt.figure(figsize=(20,6))
plt.scatter(x=np.e**y_train, y=np.e**results.fittedvalues, c='blue', alpha=0.6)
plt.plot(np.e**y_train, np.e**y_train, color='brown')
plt.xlabel('Actual prices in Dollars $y _i$', fontsize=20)
plt.ylabel('Prediced prices in Dollars $\hat y _i$', fontsize=20)
plt.title(f'Actual vs Predicted prices: $y _i$ vs $\hat y_i$ (Corr {corr})', fontsize=20)
plt.show()
# Residuals vs Predicted values (should look patternless for a good fit)
plt.figure(figsize=(20,6))
plt.scatter(x=results.fittedvalues, y=results.resid, c='navy', alpha=0.6)
plt.xlabel('Predicted log prices $\hat y _i$', fontsize=20)
plt.ylabel('Residuals', fontsize=20)
plt.title('Residuals vs Fitted Values', fontsize=20)
plt.show()
# Mean Squared Error & R-Squared
reduced_log_mse = round(results.mse_resid, 3)# kept for the model comparison later
reduced_log_rsquared = round(results.rsquared, 3) # kept for the model comparison later
# Distribution of Residuals (log prices) - checking for normality
resid_mean = round(results.resid.mean(), 3)
resid_skew = round(results.resid.skew(), 3)
print(resid_mean, '\n');
print(resid_skew) # printing the mean and skew, both close to zero
plt.figure(figsize= (20,6))
sns.distplot(results.resid, color='navy')# using the distplot of seaborn
plt.title(f'Log price model: residuals Skew ({resid_skew}) Mean ({resid_mean})')
plt.show()
# Using the original model with all the features and normal prices generate:
# Plot of actual vs predicted prices (incl. correlation)
# Plot of residuals vs. predicted prices
# Plot of distribution of residuals (incl. skew)
# Analysis of results.
# Original model: normal prices & all features
prices = data['PRICE']
features = data.drop(['PRICE'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, prices,
test_size=0.2, random_state=10)
X_incl_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_incl_const)
results = model.fit()
# Graph of Actual vs. Predicted Prices
corr = round(y_train.corr(results.fittedvalues), 2)
plt.figure(figsize=(20,6))
plt.scatter(x=y_train, y=results.fittedvalues, c='indigo', alpha=0.6)
plt.plot(y_train, y_train, color='cyan')  # perfect-prediction reference line
plt.xlabel('Actual prices 000s $y _i$', fontsize=14)
plt.ylabel('Prediced prices 000s $\hat y _i$', fontsize=14)
plt.title(f'Actual vs Predicted prices: $y _i$ vs $\hat y_i$ (Corr {corr})', fontsize=17)
plt.show()
# Residuals vs Predicted values
plt.figure(figsize=(20,6))
plt.scatter(x=results.fittedvalues, y=results.resid, c='indigo', alpha=0.6)
plt.xlabel('Predicted prices $\hat y _i$', fontsize=14)
plt.ylabel('Residuals', fontsize=14)
plt.title('Residuals vs Fitted Values', fontsize=17)
plt.show()
# Residual Distribution Chart
plt.figure(figsize=(20,6))
resid_mean = round(results.resid.mean(), 3)
resid_skew = round(results.resid.skew(), 3)
sns.distplot(results.resid, color='indigo')
plt.title(f'Residuals Skew ({resid_skew}) Mean ({resid_mean})')
plt.show()
# Mean Squared Error & R-Squared
full_normal_mse = round(results.mse_resid, 3)# kept for the comparison table below
full_normal_rsquared = round(results.rsquared, 3) # kept for the comparison table below
# Model Omitting Key Features using log prices — shows how the fit degrades
# when strong predictors are left out of the regression.
prices = np.log(data['PRICE'])
features = data.drop(['PRICE', 'INDUS', 'AGE', 'LSTAT', 'RM', 'NOX', 'CRIM'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, prices,
test_size=0.2, random_state=10)
X_incl_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_incl_const)
results = model.fit()
# Graph of Actual vs. Predicted Prices
corr = round(y_train.corr(results.fittedvalues), 2)
plt.scatter(x=y_train, y=results.fittedvalues, c='#e74c3c', alpha=0.6)
plt.plot(y_train, y_train, color='cyan')  # perfect-prediction reference line
plt.xlabel('Actual log prices $y _i$', fontsize=14)
plt.ylabel('Predicted log prices $\hat y _i$', fontsize=14)
plt.title(f'Actual vs Predicted prices with omitted variables: $y _i$ vs $\hat y_i$ (Corr {corr})', fontsize=17)
plt.show()
# Residuals vs Predicted values
plt.scatter(x=results.fittedvalues, y=results.resid, c='#e74c3c', alpha=0.6)
plt.xlabel('Predicted prices $\hat y _i$', fontsize=14)
plt.ylabel('Residuals', fontsize=14)
plt.title('Residuals vs Fitted Values', fontsize=17)
plt.show()
# Mean Squared Error & R-Squared
omitted_var_mse = round(results.mse_resid, 3)# kept for the comparison table below
omitted_var_rsquared = round(results.rsquared, 3)# kept for the comparison table below
# Side-by-side comparison of the three candidate models.
pd.DataFrame({'R-Squared': [reduced_log_rsquared, full_normal_rsquared, omitted_var_rsquared],
'MSE': [reduced_log_mse, full_normal_mse, omitted_var_mse]},
index=['Reduced Log Model', 'Full Normal Price Model', 'Omitted Var Model'])
pd.DataFrame({'R-Squared': [reduced_log_rsquared, full_normal_rsquared, omitted_var_rsquared],
'MSE': [reduced_log_mse, full_normal_mse, omitted_var_mse],
'RMSE': np.sqrt([reduced_log_mse, full_normal_mse, omitted_var_mse])},# square root of MSE gives RMSE
index=['Reduced Log Model', 'Full Normal Price Model', 'Omitted Var Model'])
# Use the reduced log model to bound a price estimate.
# For a house with an estimated price of 30,000 dollars, compute an
# approximate 2-RMSE (~95%) prediction interval on the log scale,
# then back-transform to dollars. (From the table above, MSE ~ 0.035.)
np.sqrt(reduced_log_mse)  # RMSE on the log-price scale
2*np.sqrt(reduced_log_mse)  # two standard deviations
upper_bound = np.log(30) + 2*np.sqrt(reduced_log_mse)  # upper bound (log scale)
(np.e**upper_bound)*1000  # back-transform to dollars
lower_bound = np.log(30) - 2*np.sqrt(reduced_log_mse)  # lower bound (log scale)
(np.e**lower_bound)*1000  # back-transform to dollars